This notebook accompanies the user guide “Model Comparison and Calibration Assessment” on arXiv.
The code is similar to the one used in the user guide. Please refer to it for explanations.
Note that the results might vary depending on the R package versions.
The regression example is based on the “Workers Compensation” claims dataset on OpenML. We will model the ultimate claim amount of claims based on their initial case reserves and other claim features.
The dataset is being downloaded and then stored on disk for reuse.
library(tidyverse)
library(lubridate)
library(hexbin)
library(MetricsWeighted)
library(splines)
library(xgboost)
library(arrow)
library(ggcorrplot)
library(patchwork)
library(scales)
library(glm2)
library(withr)
# Workers Compensation dataset, see
# https://www.openml.org/d/42876
# Download once from OpenML, then cache locally as parquet for fast reuse.
if (!file.exists("workers_compensation.parquet")) {
library(OpenML)
df_origin <- getOMLDataSet(data.id = 42876L)
df <- tibble(df_origin$data)
write_parquet(df, "workers_compensation.parquet")
} else {
df <- read_parquet("workers_compensation.parquet")
}# Note: WeekDayOfAccident: 1 means Monday
# Note: We filter out rows with WeeklyPay < 200
# Feature engineering: parse dates, log-transform skewed amounts, and
# cap DependentChildren (at 4) and HoursWorkedPerWeek (at 60) to tame outliers.
df <- df %>%
filter(WeeklyPay >= 200, HoursWorkedPerWeek >= 20) %>%
mutate(
DateTimeOfAccident = ymd_hms(DateTimeOfAccident),
DateOfAccident = as_date(DateTimeOfAccident),
DateReported = ymd(DateReported),
# Reporting delay in days, log1p to handle same-day reports (delay = 0)
LogDelay = log1p(as.numeric(DateReported - DateOfAccident)),
HourOfAccident = hour(DateTimeOfAccident),
WeekDayOfAccident = factor(wday(DateOfAccident, week_start = 1)),
LogWeeklyPay = log1p(WeeklyPay),
LogInitial = log(InitialCaseEstimate),
# DependentsOther = as.numeric(DependentsOther >= 1),
DependentChildren = pmin(4, DependentChildren),
HoursWorkedPerWeek = pmin(60, HoursWorkedPerWeek)
) %>%
mutate_at(c("Gender", "MaritalStatus", "PartTimeFullTime"), as.factor) %>%
rename(HoursPerWeek = HoursWorkedPerWeek)
# Feature lists reused throughout: continuous, discrete, and the response
x_continuous <- c("Age", "LogWeeklyPay", "LogInitial", "HourOfAccident",
"HoursPerWeek", "LogDelay")
x_discrete <- c("Gender", "MaritalStatus", "PartTimeFullTime",
"DependentChildren", "DaysWorkedPerWeek", "WeekDayOfAccident")
x_vars <- c(x_continuous, x_discrete)
y_var <- "UltimateIncurredClaimCost"
# Quick preview of the modeling columns
df %>%
select(all_of(y_var), all_of(x_vars)) %>%
print(n = 10, width = 80)## # A tibble: 82,017 x 13
## UltimateIncurredCl… Age LogWeeklyPay LogInitial HourOfAccident HoursPerWeek
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 102. 45 6.22 9.16 9 38
## 2 1451 40 5.65 8.01 15 38
## 3 320. 50 6.25 6.91 7 38
## 4 108 19 5.30 4.70 14 38
## 5 7111. 19 6.64 9.18 14 40
## 6 8379. 21 5.30 9.16 12 37
## 7 25338. 50 6.79 11.2 0 38
## 8 354 25 5.83 7.70 8 35
## 9 2269. 20 5.51 8.01 11 40
## 10 129367. 43 7.55 9.16 16 40
## # … with 82,007 more rows, and 7 more variables: LogDelay <dbl>, Gender <fct>,
## # MaritalStatus <fct>, PartTimeFullTime <fct>, DependentChildren <dbl>,
## # DaysWorkedPerWeek <dbl>, WeekDayOfAccident <fct>
# Mean and median of the response, used as reference lines in the histogram
df_values <- data.frame(
functional = c("mean", "median"),
value = c(mean(df[[y_var]]), median(df[[y_var]]))
)
# Histogram of the (heavily right-skewed) response on a log10 x-axis
ggplot(df, aes(x = UltimateIncurredClaimCost)) +
geom_histogram(aes(y = ..density..), bins = 200, fill = "#E69F00") +
geom_vline(
data = df_values,
aes(xintercept = value, color = functional),
linetype = "dashed",
size = 1
) +
scale_x_log10() +
xlab("Log(UltimateIncurredClaimCost)") +
ggtitle("Histogram of UltimateIncurredClaimCost")df %>%
select_at(x_continuous) %>%
pivot_longer(everything()) %>%
mutate(name = factor(name, levels = x_continuous)) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 19, fill = "#E69F00") +
facet_wrap(~name, scales = "free", ncol = 3) +
labs(y = element_blank()) +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
) +
ggtitle("Histograms of numerical features")df %>%
select_at(x_discrete) %>%
mutate_all(as.character) %>%
pivot_longer(everything()) %>%
mutate(
name = factor(name, levels = x_discrete),
value = factor(value)
) %>%
ggplot(aes(x = value)) +
geom_bar(fill = "#E69F00") +
facet_wrap(~name, scales = "free", ncol = 3) +
labs(y = "Count") +
ggtitle("Histograms of categorical features")df %>%
select_at(x_continuous) %>%
cor() %>%
round(2) %>%
ggcorrplot(
hc.order = FALSE,
type = "upper",
outline.col = "white",
ggtheme = theme_minimal(),
colors = c("#6D9EC1", "white", "#E46726")
)df_cat <- df %>%
select(all_of(x_discrete), all_of(y_var)) %>%
mutate(across(-all_of(y_var), as.factor)) %>%
pivot_longer(cols = -all_of(y_var))
# Response vs categorical features (log-scale boxplots, width ~ group size)
ggplot(df_cat, aes_string("value", y_var)) +
geom_boxplot(varwidth = TRUE, fill = "orange") +
facet_wrap(~ name, scales = "free_x") +
scale_y_log10() +
xlab(element_blank()) +
ggtitle("Boxplots for categorical features")df_num <- df %>%
select(all_of(x_continuous), all_of(y_var)) %>%
pivot_longer(cols = -all_of(y_var))
# Response vs continuous features as hexbin plots (log color + y scales)
ggplot(df_num, aes_string("value", y = y_var)) +
geom_hex(bins = 18, show.legend = FALSE) +
facet_wrap(~ name, scales = "free") +
scale_y_log10() +
scale_fill_viridis_c(option = "magma", trans = "log10") +
xlab(element_blank())Next, we split our dataset into training and test data.
# Reproducible 75/25 train/test split by row index
set.seed(1234321L)
.in <- sample(nrow(df), round(0.75 * nrow(df)), replace = FALSE)
train <- df[.in, ]
test <- df[-.in, ]
y_train <- train[[y_var]]
y_test <- test[[y_var]]
# Tag every row with its dataset membership for later grouped summaries
df <- df %>%
mutate(dataset = factor(c("test"), c("train", "test")))
df$dataset[.in] <- "train"
# Compare mean and quantiles of the response across the two partitions
df %>%
group_by(dataset) %>%
summarise(
mean = mean(UltimateIncurredClaimCost),
q20 = quantile(UltimateIncurredClaimCost, probs=0.2),
q40 = quantile(UltimateIncurredClaimCost, probs=0.4),
q50 = median(UltimateIncurredClaimCost),
q60 = quantile(UltimateIncurredClaimCost, probs=0.6),
q80 = quantile(UltimateIncurredClaimCost, probs=0.8),
q90 = quantile(UltimateIncurredClaimCost, probs=0.9)
)trivial_predict <- function(X) {
# Baseline model: predicts the training mean for every row of X
mean(y_train) * rep(1, dim(X)[1])
}form <- reformulate(x_vars, y_var)
# Ordinary least squares on the raw (right-skewed) response
fit_ols <- lm(form, data = train)
summary(fit_ols)##
## Call:
## lm(formula = form, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59570 -17197 -7636 1715 3098226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94262.309 4289.601 -21.975 < 2e-16 ***
## Age 229.202 21.032 10.898 < 2e-16 ***
## LogWeeklyPay 6641.731 484.474 13.709 < 2e-16 ***
## LogInitial 7141.915 141.047 50.635 < 2e-16 ***
## HourOfAccident -81.457 58.319 -1.397 0.162493
## HoursPerWeek -8.964 52.903 -0.169 0.865449
## LogDelay 1388.656 265.576 5.229 1.71e-07 ***
## GenderM -3241.978 563.400 -5.754 8.74e-09 ***
## MaritalStatusS -1214.311 540.533 -2.247 0.024675 *
## MaritalStatusU -887.372 828.146 -1.072 0.283941
## PartTimeFullTimeP 3254.703 1120.526 2.905 0.003678 **
## DependentChildren 1169.777 452.950 2.583 0.009809 **
## DaysWorkedPerWeek 1101.599 623.100 1.768 0.077077 .
## WeekDayOfAccident2 -2709.643 740.163 -3.661 0.000252 ***
## WeekDayOfAccident3 -3303.039 736.864 -4.483 7.39e-06 ***
## WeekDayOfAccident4 -3560.326 739.063 -4.817 1.46e-06 ***
## WeekDayOfAccident5 -3909.812 739.677 -5.286 1.26e-07 ***
## WeekDayOfAccident6 -1306.407 1021.934 -1.278 0.201125
## WeekDayOfAccident7 -3696.886 1240.245 -2.981 0.002876 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54970 on 61494 degrees of freedom
## Multiple R-squared: 0.06627, Adjusted R-squared: 0.066
## F-statistic: 242.5 on 18 and 61494 DF, p-value: < 2.2e-16
# Out-of-sample R-squared of the OLS model relative to the trivial mean model
r_squared(y_test, predict(fit_ols, test), reference_mean = mean(y_train)) ## [1] 0.06800488
# r_squared_gamma(y_test, predict(fit_ols, test), reference_mean = mean(y_train))
# Error because of negative predictions
# OLS on the log response; predictions are back-transformed via exp()
fit_ols_log <- lm(update(form, log(UltimateIncurredClaimCost) ~ .), data = train)
ols_predict <- function(X){
exp(predict(fit_ols_log, X))
}
# Same with bias correction
# (multiplicative factor so that in-sample predictions average to mean(y_train))
corr_fact_ols <- mean(y_train) / mean(exp(fitted(fit_ols_log)))
ols_corr_predict <- function(X) {
corr_fact_ols * exp(predict(fit_ols_log, X))
}# Note: Standard glm(..) raises
# Error in glm.fit(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, :
# NA/NaN/Inf in 'x'
# Therefore, we use glm2 which is more stable.
fit_glm_gamma <- glm2(reformulate(x_vars, y_var), data = train,
family = Gamma(link = "log"))
summary(fit_glm_gamma)##
## Call:
## glm2(formula = reformulate(x_vars, y_var), family = Gamma(link = "log"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.763 -1.957 -1.499 -0.854 51.614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.028350 0.613185 -0.046 0.963125
## Age 0.021583 0.003007 7.179 7.11e-13 ***
## LogWeeklyPay 0.670881 0.069254 9.687 < 2e-16 ***
## LogInitial 0.482337 0.020162 23.923 < 2e-16 ***
## HourOfAccident 0.006094 0.008336 0.731 0.464767
## HoursPerWeek 0.001041 0.007562 0.138 0.890509
## LogDelay 0.134833 0.037963 3.552 0.000383 ***
## GenderM -0.347941 0.080536 -4.320 1.56e-05 ***
## MaritalStatusS -0.153744 0.077268 -1.990 0.046622 *
## MaritalStatusU 0.122295 0.118381 1.033 0.301579
## PartTimeFullTimeP 0.204716 0.160176 1.278 0.201230
## DependentChildren 0.152436 0.064748 2.354 0.018561 *
## DaysWorkedPerWeek 0.030446 0.089070 0.342 0.732487
## WeekDayOfAccident2 -0.092122 0.105804 -0.871 0.383928
## WeekDayOfAccident3 -0.277486 0.105332 -2.634 0.008431 **
## WeekDayOfAccident4 -0.254000 0.105647 -2.404 0.016209 *
## WeekDayOfAccident5 -0.258506 0.105735 -2.445 0.014494 *
## WeekDayOfAccident6 -0.198524 0.146082 -1.359 0.174156
## WeekDayOfAccident7 -0.093022 0.177289 -0.525 0.599802
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 61.75326)
##
## Null deviance: 312399 on 61512 degrees of freedom
## Residual deviance: 225203 on 61494 degrees of freedom
## AIC: 1138728
##
## Number of Fisher Scoring iterations: 12
# Response-scale prediction wrapper for the Gamma GLM
glm_gamma_predict <- function(X) {
predict(fit_glm_gamma, X, type = "response")
}
# Same with bias correction
# (rescales so that in-sample predictions average to mean(y_train))
corr_fact_glm <- mean(y_train) / mean(glm_gamma_predict(train))
glm_gamma_corr_predict <- function(X){
corr_fact_glm * predict(fit_glm_gamma, X, type = "response")
}# Quasi-Poisson instead of Poisson suppresses warnings for non-integer y
fit_glm_poisson <- glm(
reformulate(x_vars, y_var), data = train, family = quasipoisson
)
summary(fit_glm_poisson)##
## Call:
## glm(formula = reformulate(x_vars, y_var), family = quasipoisson,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -948.6 -116.2 -65.4 -35.7 5171.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.381284 0.294657 1.294 0.195672
## Age 0.013423 0.001475 9.098 < 2e-16 ***
## LogWeeklyPay 0.501197 0.036604 13.692 < 2e-16 ***
## LogInitial 0.655883 0.012763 51.390 < 2e-16 ***
## HourOfAccident 0.002261 0.004030 0.561 0.574840
## HoursPerWeek -0.004563 0.003310 -1.379 0.167981
## LogDelay 0.057101 0.019625 2.910 0.003619 **
## GenderM -0.255144 0.039523 -6.456 1.09e-10 ***
## MaritalStatusS -0.176509 0.039997 -4.413 1.02e-05 ***
## MaritalStatusU -0.106357 0.053720 -1.980 0.047726 *
## PartTimeFullTimeP 0.122453 0.072032 1.700 0.089138 .
## DependentChildren 0.053915 0.028606 1.885 0.059466 .
## DaysWorkedPerWeek 0.026961 0.038600 0.698 0.484880
## WeekDayOfAccident2 -0.193843 0.053965 -3.592 0.000328 ***
## WeekDayOfAccident3 -0.233027 0.054652 -4.264 2.01e-05 ***
## WeekDayOfAccident4 -0.262282 0.054996 -4.769 1.85e-06 ***
## WeekDayOfAccident5 -0.274203 0.055754 -4.918 8.76e-07 ***
## WeekDayOfAccident6 -0.098053 0.073250 -1.339 0.180700
## WeekDayOfAccident7 -0.263446 0.095347 -2.763 0.005728 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 228093.6)
##
## Null deviance: 3504504976 on 61512 degrees of freedom
## Residual deviance: 2421145448 on 61494 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 7
# Response-scale prediction wrapper for the quasi-Poisson GLM
glm_poisson_predict <- function(X){
predict(fit_glm_poisson, X, type = "response")
}Resonable choices for XGBoost’s hyperparameters are provided in the grid search table stored as “grid/grid_xgb.RData”. Thus, there is no need for retuning those parameters.
# Data interface
dtrain <- xgb.DMatrix(data.matrix(train[, x_vars]), label = y_train)
# Settings
# Set tune <- TRUE to re-run the (slow) hyperparameter search;
# by default the cached grid in file_grid is loaded instead.
tune <- FALSE
file_grid <- "grid/grid_xgb.RData"
if (tune) {
# Step 1: find good learning rate
xgb.cv(
list(learning_rate = 0.03),
dtrain,
nrounds = 5000,
tree_method = "hist",
nfold = 5,
objective = "reg:gamma",
showsd = FALSE,
early_stopping_rounds = 20,
verbose = 2
)
# Step 2: Grid search CV on typical parameter combos
# (iteration/score columns are placeholders filled in during the loop)
paramGrid <- expand.grid(
iteration = NA,
score = NA,
learning_rate = 0.03,
max_depth = 4:6,
min_child_weight = c(0, 1e-04),
colsample_bynode = c(0.8, 1),
subsample = c(0.8, 1),
reg_lambda = 0:2,
reg_alpha = 0:2,
tree_method = "hist",
eval_metric = "gamma-nloglik"
)
# Random search: evaluate at most 20 randomly chosen combinations
if (nrow(paramGrid) > 20) {
set.seed(342267)
paramGrid <- paramGrid[sample(nrow(paramGrid), 20), ]
}
for (i in seq_len(nrow(paramGrid))) {
print(i)
cvm <- xgb.cv(
as.list(paramGrid[i, -(1:2)]),
dtrain,
nrounds = 5000,
nfold = 5,
objective = "reg:gamma",
showsd = FALSE,
early_stopping_rounds = 20,
verbose = 0
)
paramGrid[i, 1] <- bi <- cvm$best_iteration
paramGrid[i, 2] <- cvm$evaluation_log[bi, "test_gamma_nloglik_mean"] %>%
as.numeric()
# Save after every iteration so an interrupted search keeps partial results
save(paramGrid, file = file_grid)
}
}
load(file_grid, verbose = TRUE)## Loading objects:
## paramGrid
# Step 3: Fit on best params
head(paramGrid <- paramGrid[order(paramGrid$score), ])params <- paramGrid[1, ]
set.seed(93845)
fit_xgb <- xgb.train(
as.list(params[, -(1:2)]),
data = dtrain,
nrounds = params$iteration,
objective = "reg:gamma"
)
# Predict wrapper for xgb model
xgb_predict <- function(newdata, x = x_vars) {
predict(fit_xgb, data.matrix(newdata[, x]))
}
# Same with bias correction
corr_fact_xgb <- mean(y_train) / mean(xgb_predict(train))
xgb_corr_predict <- function(newdata, x = x_vars) {
corr_fact_xgb * predict(fit_xgb, data.matrix(newdata[, x]))
}
# feature importance, helps to select features for calibration analysis
importanceRaw <- xgb.importance(model = fit_xgb)
xgb.plot.importance(importance_matrix = importanceRaw)model_levels <- c("trivial", "ols", "ols_corr", "glm_gamma",
"glm_gamma_corr", "glm_poisson", "xgb", "xgb_corr")
# Long-format table: one row per (observation, model) with prediction and bias
df_cali <- df %>%
mutate(
prediction_trivial = trivial_predict(.),
prediction_ols = ols_predict(.),
prediction_ols_corr = ols_corr_predict(.),
prediction_glm_gamma = glm_gamma_predict(.),
prediction_glm_gamma_corr = glm_gamma_corr_predict(.),
prediction_glm_poisson = glm_poisson_predict(.),
prediction_xgb = xgb_predict(.),
prediction_xgb_corr = xgb_corr_predict(.)
) %>%
pivot_longer(
cols = starts_with("prediction"),
names_to = "model",
names_prefix = "prediction_",
values_to = "prediction"
) %>%
mutate(
# bias = prediction minus observed response (negative residual)
bias = prediction - UltimateIncurredClaimCost,
model = factor(model, model_levels)
)
# Unconditional calibration
# Mean bias per dataset/model with a t-test p-value for mean bias = 0
df_cali %>%
group_by(dataset, model) %>%
summarise(
mean_bias = num(mean(bias), digits = 0),
p_value = num(t.test(bias)$p.value, sigfig = 2),
.groups = "drop"
)Visualized differently on train and test.
# Hexbin plot of bias vs prediction per model (training set), with GAM smooth
df_cali %>%
filter(dataset == "train") %>%
# sample_n(10000) %>%
ggplot(aes(x = prediction, y = bias)) +
geom_hex(bins = 23, show.legend = TRUE) +
geom_smooth(
method = "gam",
formula = y ~ s(x, bs = "cr", k = 20),
se = FALSE
) +
scale_fill_viridis_c(option = "magma", trans = "log10") +
facet_wrap(~ model) +
theme(legend.position = c(0.9, 0), legend.justification = c(1, 0)) +
ggtitle("Bias (negative residuals) vs predicted (training set)")# Smoothed curve on test
df_cali %>%
filter(dataset == "test", prediction <= 1e5) %>%
ggplot(aes(x = prediction, y = bias, linetype = model, color = model)) +
geom_smooth(
method = "gam",
formula = y ~ s(x, bs = "cr", k = 20),
se = FALSE
) +
coord_cartesian(ylim = c(-2e4, 2e4)) +
ggtitle("Bias (negative residuals) vs predicted (test set)")# Table for test function phi = prediction
df_cali %>%
group_by(dataset, model) %>%
summarise(
bias = num(mean(prediction * bias), sigfig = 4, notation = "sci"),
.groups = "drop"
) %>%
pivot_wider(names_from = dataset, values_from = bias)# Note: We want to divide by n_train and n_test and NOT by n(Gender=M) and n(Gender=F)
# Mean bias with indicator test functions 1{Gender = F} and 1{Gender = M}
df_cali %>%
group_by(dataset, model) %>%
summarise(
F = mean(bias * (Gender == "F")),
M = mean(bias * (Gender == "M")),
.groups = "drop"
)Bar plot of mean bias on the test set. The first plot for the two test functions \(\phi(x) = 1\{Gender = F\}\) and \(\phi(x) = 1\{Gender = M\}\) and the second one for evaluation of \(\bar V\) stratified by gender.
# Indicator test functions: mean bias * 1{Gender = F/M}, normalized by full n
df_indicator <- df_cali %>%
filter(dataset == "test") %>%
group_by(model) %>%
summarise(
F = mean(bias * (Gender == "F")),
M = mean(bias * (Gender == "M")),
.groups = "drop"
) %>%
pivot_longer(c("F", "M"), names_to = "Gender", values_to = "mean bias") %>%
mutate(type = "whole sample")
# Stratified version: mean bias within each gender subsample
df_strat <- df_cali %>%
filter(dataset == "test") %>%
group_by(model, Gender) %>%
summarise(`mean bias` = mean(bias), .groups = "drop") %>%
mutate(type = "subsample per Gender")
# Plot both variants side by side (dodged points per model)
df_indicator %>%
bind_rows(df_strat) %>%
ggplot(aes(x = Gender, y = `mean bias`, group = model, color = model)) +
geom_point(size = 3, position = position_dodge(width = 0.3), alpha = 0.7) +
facet_wrap(~ fct_rev(type)) +
ggtitle("Mean bias by Gender (test set)")LogWeeklyPay is the second most important feature after the initial claim reserve estimate.
# Binning of LogWeeklyPay in 10 quantiles
# Use mean of quantile as x-value
breaks <- unique(quantile(df$LogWeeklyPay, (0:10) / 10))
midpoints <- (breaks[-length(breaks)] + breaks[-1]) / 2
df_binned <- df_cali %>%
mutate(
LogWeeklyPayBinned = midpoints[
cut(LogWeeklyPay, breaks, include.lowest = TRUE, labels = FALSE)
],
# Invert log1p to show the bin midpoint on the original pay scale
WeeklyPayBinned = exp(LogWeeklyPayBinned) - 1
)
# p1: bin-wise bias sums divided by the full partition size (n_train / n_test)
p1 <- df_binned %>%
group_by(dataset, model, WeeklyPayBinned) %>%
summarise(bias = sum(bias), .groups = "drop") %>%
mutate(
bias = bias / case_when(
dataset=="train" ~ nrow(train),
dataset=="test" ~ nrow(test)
)
) %>%
ggplot(aes(x = WeeklyPayBinned, y = bias, color = model, group = model)) +
geom_line() +
geom_point() +
scale_x_log10() +
facet_wrap(~ dataset) +
ylab("mean bias") +
ggtitle("Mean bias on whole sample") +
theme(plot.title = element_text(size = 12))
# p2: mean bias within each bin (divides by the bin size instead)
p2 <- df_binned %>%
group_by(dataset, model, WeeklyPayBinned) %>%
summarise(bias = mean(bias), .groups = "drop") %>%
ggplot(aes(x = WeeklyPayBinned, y = bias, color = model, group = model)) +
geom_line() +
geom_point() +
scale_x_log10() +
facet_wrap(~ dataset) +
ylab("mean bias") +
ggtitle("Mean bias per sample in bin") +
theme(plot.title = element_text(size = 12))
# Stack both panels via patchwork with a shared legend
p1 / p2 +
plot_layout(guides = "collect") +
plot_annotation(title = "Mean bias on binned LogWeeklyPay")# Test function = LogWeeklyPay
with_options(
list(scipen = 3, pillar.sigfig = 0),
df_cali %>%
group_by(dataset, model) %>%
summarise(calib_test = mean(LogWeeklyPay * bias), .groups = "drop") %>%
pivot_wider(names_from = dataset, values_from = calib_test)
)# Function returns the mean Gamma deviance and the corresponding D-squared
# Computes the mean Gamma deviance and the corresponding D-squared
# (deviance-based R-squared relative to the reference mean `ref`)
# for one dataset/model combination.
perf <- function(X, actual, predicted, ref) {
  obs <- X[[actual]]
  fit <- X[[predicted]]
  scores <- c(
    `Mean deviance` = deviance_gamma(obs, fit),
    `D-Squared` = r_squared_gamma(obs, fit, reference_mean = ref)
  )
  tibble(Measure = names(scores), Score = scores)
}
# Apply it to combinations of dataset and model
df_perf <- df_cali %>%
group_by(dataset, model) %>%
summarize(
perf(cur_data(), y_var, "prediction", mean(y_train)),
.groups = "drop"
)
# Wide table: one column per (Measure, dataset) combination
df_perf %>%
pivot_wider(
"model",
names_from = c("Measure", "dataset"),
values_from = "Score", names_sort = TRUE
)ggplot(df_perf, aes(y = Score, x = dataset, color = model, group = model)) +
geom_point(size = 2) +
geom_line() +
facet_wrap(~ Measure, scales = "free")df_murphy <- df_cali %>%
filter(!(model %in% c("trivial", "ols"))) %>%
group_by(dataset, model) %>%
summarize(
# Elementary scores over a log-spaced grid of thresholds theta
murphy_diagram(
cur_data()[[y_var]],
cur_data()$prediction,
plot = FALSE,
theta = exp(seq(log(10000), log(1.7e5), by = 0.02))
),
.groups = "drop"
) %>%
rename(Score = predicted)
# Vertical line is the mean y on the training data (for both graphs)
ggplot(df_murphy, aes(y = Score, x = theta)) +
geom_line(aes(color = model, linetype = model, group = model), size = 0.75) +
geom_vline(xintercept = mean(train[[y_var]]), linetype = 2, size = 0.75) +
facet_wrap(~ dataset, scales = "free", ncol = 2) +
scale_x_log10() +
theme(legend.title = element_blank())df_tweedie <- df_cali %>%
filter(!(model %in% c("trivial", "ols"))) %>%
group_by(dataset, model) %>%
summarize(
# Tweedie deviance for power parameters p in [1, 3]
performance(
cur_data(),
actual = y_var,
predicted = "prediction",
metrics = multi_metric(deviance_tweedie, tweedie_p = seq(1, 3, by = 0.05)),
key = "Tweedie_p",
value = "Deviance"
),
.groups = "drop"
) %>%
mutate(
Tweedie_p = as.numeric(as.character(Tweedie_p)),
# Rescale so deviances are comparable across different powers p
Deviance_rescaled = Deviance * mean(df_cali[[y_var]])^(Tweedie_p-2)
)
ggplot(df_tweedie, aes(y = Deviance_rescaled, x = Tweedie_p)) +
geom_line(aes(color = model, linetype = model, group = model), size = 0.75) +
facet_wrap(~ dataset, scales = "free", ncol = 2) +
scale_y_log10() +
theme(legend.title = element_blank())The classification example is based on the “Telco Customer Churn” dataset on OpenML. We will model churn probability as a function of a couple of features.
The dataset is being downloaded and then stored on disk for reuse.
library(tidyverse)
library(lubridate)
library(hexbin)
library(MetricsWeighted)
library(ranger)
library(xgboost)
library(arrow)
library(ggcorrplot)
library(patchwork)
library(scales)
library(withr)
library(splitTools)
library(reliabilitydiag)
# Telco Customer Churn dataset (OpenML id 42178):
# download once, then cache locally as parquet for fast reuse.
if (!file.exists("churn.parquet")) {
library(OpenML)
df_origin <- getOMLDataSet(data.id = 42178)
df <- tibble(df_origin$data)
write_parquet(df, "churn.parquet")
} else {
df <- read_parquet("churn.parquet")
}df[df == "No internet service" | df == "No phone service"] <- "No"
# Preprocessing: log total charges, binary 0/1 churn response,
# impute missing LogTotalCharges with the median, factorize strings.
df <- df %>%
mutate(
LogTotalCharges = log(as.numeric(TotalCharges)),
Churn = (Churn == "Yes") + 0,
) %>%
replace_na(
list(LogTotalCharges = median(.$LogTotalCharges, na.rm = TRUE))
) %>%
mutate_if(is.character, as.factor)
y_var <- "Churn"
x_continuous <- c("tenure", "MonthlyCharges", "LogTotalCharges")
# All factor columns except id, response, and the raw TotalCharges
x_discrete <- setdiff(
colnames(select_if(df, is.factor)), c("customerID", y_var, "TotalCharges")
)
x_vars <- c(x_continuous, x_discrete)
df[c(y_var, x_vars)]table(df[[y_var]], useNA = "ifany")##
## 0 1
## 5174 1869
# Univariate description
# Histograms of the continuous features
df %>%
select_at(x_continuous) %>%
pivot_longer(everything()) %>%
mutate(name = factor(name, levels = x_continuous)) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 19, fill = "#E69F00") +
facet_wrap(~name, scales = "free", ncol = 3) +
labs(y = element_blank()) +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
) +
ggtitle("Histograms of numerical features")df %>%
select_at(x_discrete) %>%
mutate_all(as.character) %>%
pivot_longer(everything()) %>%
mutate(
name = factor(name, levels = x_discrete),
value = factor(value)
) %>%
ggplot(aes(x = value)) +
geom_bar(fill = "#E69F00") +
facet_wrap(~ name, scales = "free", ncol = 3) +
labs(y = "Count") +
ggtitle("Barplots of categorical features")df %>%
select_at(x_continuous) %>%
cor() %>%
round(2) %>%
ggcorrplot(
hc.order = FALSE,
type = "upper",
outline.col = "white",
ggtheme = theme_minimal(),
colors = c("#6D9EC1", "white", "#E46726")
)# Response in dependence of covariates
# Mean churn rate per level of each categorical feature
df_cat <- df %>%
select(all_of(x_discrete), all_of(y_var)) %>%
mutate(across(-all_of(y_var), as.factor)) %>%
pivot_longer(cols = -all_of(y_var)) %>%
group_by(name, value) %>%
summarize(Churn = mean(Churn), .groups="drop")
ggplot(df_cat, aes_string("value", y_var)) +
geom_point(color = "orange", size = 3) +
facet_wrap(~ name, scales = "free_x") +
scale_y_log10() +
xlab(element_blank()) +
ggtitle("Mean churn per categorical level")df_num <- df %>%
select(all_of(x_continuous), all_of(y_var)) %>%
pivot_longer(cols = -all_of(y_var))
# Smoothed churn rate vs each continuous feature
ggplot(df_num, aes_string("value", y = y_var)) +
geom_smooth() +
facet_wrap(~ name, scales = "free") +
xlab(element_blank())Next, we split our dataset into training and test/validation data, stratified by the response.
# Reproducible 75/25 split, stratified by the churn response
set.seed(34621L)
inds <- partition(df$Churn, p = c(train = 0.75, test = 0.25))
train <- df[inds$train, ]
test <- df[inds$test, ]
y_train <- train[[y_var]]
y_test <- test[[y_var]]
# Tag every row with its dataset membership for later grouped summaries
df <- df %>%
mutate(dataset = factor(c("test"), c("train", "test")))
df$dataset[inds$train] <- "train"
df %>%
group_by(dataset) %>%
summarise(mean(Churn), .groups="drop")trivial_predict <- function(X) {
# Baseline model: predicts the training churn rate for every row of X
mean(y_train) * rep(1, dim(X)[1])
}form <- reformulate(x_vars, y_var)
# Logistic regression benchmark
fit_glm <- glm(form, data = train, family = binomial())
summary(fit_glm)##
## Call:
## glm(formula = form, family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1015 -0.6929 -0.2972 0.6361 3.1984
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7490537 0.9568960 2.873 0.00407 **
## tenure -0.0004815 0.0041139 -0.117 0.90683
## MonthlyCharges -0.0078254 0.0362538 -0.216 0.82910
## LogTotalCharges -0.5921601 0.0595243 -9.948 < 2e-16 ***
## genderMale -0.0342426 0.0748712 -0.457 0.64742
## PartnerYes 0.0435868 0.0900245 0.484 0.62827
## DependentsYes -0.1256488 0.1035362 -1.214 0.22491
## PhoneServiceYes -0.1201122 0.7424339 -0.162 0.87148
## MultipleLinesYes 0.4486445 0.2024268 2.216 0.02667 *
## InternetServiceFiber optic 1.3643974 0.9119496 1.496 0.13462
## InternetServiceNo -1.5244306 0.9250945 -1.648 0.09938 .
## OnlineSecurityYes -0.2069737 0.2039200 -1.015 0.31012
## OnlineBackupYes -0.0101351 0.2002946 -0.051 0.95964
## DeviceProtectionYes 0.1374344 0.2011316 0.683 0.49441
## TechSupportYes -0.1697132 0.2058273 -0.825 0.40963
## StreamingTVYes 0.3873036 0.3734791 1.037 0.29973
## StreamingMoviesYes 0.4607985 0.3726992 1.236 0.21632
## ContractOne year -0.7008268 0.1236184 -5.669 1.43e-08 ***
## ContractTwo year -1.7527819 0.2059230 -8.512 < 2e-16 ***
## PaperlessBillingYes 0.3720130 0.0861707 4.317 1.58e-05 ***
## PaymentMethodCredit card (automatic) -0.1165994 0.1286154 -0.907 0.36463
## PaymentMethodElectronic check 0.1808147 0.1089242 1.660 0.09691 .
## PaymentMethodMailed check -0.1469906 0.1342096 -1.095 0.27342
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6110.3 on 5280 degrees of freedom
## Residual deviance: 4388.3 on 5258 degrees of freedom
## AIC: 4434.3
##
## Number of Fisher Scoring iterations: 6
# Out-of-sample deviance-based D-squared of the logistic regression
r_squared_bernoulli(
y_test,
predict(fit_glm, test, type = "response"),
reference_mean = mean(y_train)
)## [1] 0.3399342
# Probability random forest (ranger); OOB error is the Brier score
fit_rf <- ranger(
form,
data = train,
probability = TRUE,
seed = 774,
min.node.size = 30,
oob.error = TRUE
)
fit_rf## Ranger result
##
## Call:
## ranger(form, data = train, probability = TRUE, seed = 774, min.node.size = 30, oob.error = TRUE)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 5281
## Number of independent variables: 18
## Mtry: 4
## Target node size: 30
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1395934
Reasonable choices for XGBoost’s hyperparameters are provided in the grid search table stored as “grid/c_grid_xgb.RData”. Thus, there is no need for retuning those parameters.
# Data interface
dtrain <- xgb.DMatrix(data.matrix(train[, x_vars]), label = y_train)
# Settings
# Set tune <- TRUE to re-run the (slow) hyperparameter search;
# by default the cached grid in file_grid is loaded instead.
tune <- FALSE
file_grid <- "grid/c_grid_xgb.RData"
if (tune) {
# Step 1: find good learning rate
xgb.cv(
list(learning_rate = 0.01),
dtrain,
nrounds = 5000,
nfold = 5,
objective = "binary:logistic",
showsd = FALSE,
early_stopping_rounds = 20,
verbose = 2
)
# Step 2: Grid search CV on typical parameter combos
# (iteration/score columns are placeholders filled in during the loop)
paramGrid <- expand.grid(
iteration = NA,
score = NA,
learning_rate = 0.01,
max_depth = 3:6,
min_child_weight = c(0, 1e-04),
colsample_bynode = c(0.8, 1),
subsample = c(0.8, 1),
reg_lambda = 0:2,
reg_alpha = 0:2,
eval_metric = "logloss"
)
# Random search: evaluate at most 20 randomly chosen combinations
if (nrow(paramGrid) > 20) {
set.seed(342267)
paramGrid <- paramGrid[sample(nrow(paramGrid), 20), ]
}
for (i in seq_len(nrow(paramGrid))) {
print(i)
cvm <- xgb.cv(
as.list(paramGrid[i, -(1:2)]),
dtrain,
nrounds = 5000,
nfold = 5,
objective = "binary:logistic",
showsd = FALSE,
early_stopping_rounds = 20,
verbose = 0
)
paramGrid[i, 1] <- bi <- cvm$best_iteration
paramGrid[i, 2] <- cvm$evaluation_log[bi, "test_logloss_mean"] %>%
as.numeric()
# Save after every iteration so an interrupted search keeps partial results
save(paramGrid, file = file_grid)
}
}
load(file_grid, verbose = TRUE)## Loading objects:
## paramGrid
# Step 3: Fit on best params
head(paramGrid <- paramGrid[order(paramGrid$score), ])params <- paramGrid[1, ]
set.seed(76)
fit_xgb <- xgb.train(
as.list(params[, -(1:2)]),
data = dtrain,
nrounds = params$iteration,
objective = "binary:logistic"
)
# Predict wrapper for xgb model
xgb_predict <- function(newdata, x = x_vars) {
predict(fit_xgb, data.matrix(newdata[, x]))
}
# feature importance
importanceRaw <- xgb.importance(model = fit_xgb)
xgb.plot.importance(importance_matrix = importanceRaw)model_levels <- c("trivial", "logreg", "rf", "xgb")
# Long-format table: one row per (observation, model) with prediction and bias
df_cali <- df %>%
mutate(
prediction_trivial = trivial_predict(.),
prediction_logreg = predict(fit_glm, ., type = "response"),
# ranger returns a matrix of class probabilities; column 2 is P(Churn = 1)
prediction_rf = predict(fit_rf, .)$predictions[, 2],
prediction_xgb = xgb_predict(.)
) %>%
pivot_longer(
cols = starts_with("prediction"),
names_to = "model",
names_prefix = "prediction_",
values_to = "prediction"
) %>%
mutate(
# bias = predicted probability minus observed 0/1 outcome
bias = prediction - Churn,
model = factor(model, model_levels)
)
# Unconditional calibration
df_cali %>%
group_by(dataset, model) %>%
summarise(Mean_bias = mean(bias), .groups = "drop") %>%
mutate(Mean_bias = zapsmall(Mean_bias)) %>%
pivot_wider(
id_cols = "model",
names_from = "dataset",
values_from = "Mean_bias"
)reldiag <- reliabilitydiag(
logreg = filter(df_cali, model == "logreg", dataset == "test")$prediction,
rf = filter(df_cali, model == "rf", dataset == "test")$prediction,
xgb = filter(df_cali, model == "xgb", dataset == "test")$prediction,
y = y_test,
xtype = "continuous",
region.level = NA
)
# Get decomposition of log loss score => does not work
# Elementwise log loss (cross-entropy) of probabilistic predictions `pred`
# against 0/1 outcomes `obs`. A perfect certain prediction (obs == pred
# with pred in {0, 1}) scores exactly 0 instead of the 0 * log(0) NaN.
log_loss_score <- function(obs, pred) {
  loss <- -obs * log(pred) - (1 - obs) * log(1 - pred)
  perfect <- (obs == 0 & pred == 0) | (obs == 1 & pred == 1)
  loss[perfect] <- 0
  loss
}
# Score decomposition of log loss
# MCB = miscalibration, DSC = discrimination, UNC = uncertainty
log_loss_decomp_text <- summary(reldiag, score = "log_loss_score") %>%
mutate(
text = paste0(
"MCB = ", format(miscalibration, digits = 3), "\n",
"DSC = ", format(discrimination, digits = 3), "\n",
"UNC = ", format(uncertainty, digits = 3)
)
)
# Reliability diagrams per model, annotated with the score decomposition
autoplot(
reldiag,
params_CEPline = list(size = 0.5),
params_histogram = list(colour = "black", fill = NA)
) +
facet_wrap("forecast") +
ggtitle("Reliability diagrams (test set)") +
guides(
color = guide_legend(title = "model"),
linetype = guide_legend(title = "model")
) +
geom_label(
data = log_loss_decomp_text,
mapping = aes(x = 0.0, y = 1, label = text),
size = 3,
hjust = "left",
vjust = "top"
)# Function returns the log loss, the corresponding D-squared, and the AUC
# Note: Bernoulli deviance = 2 * logloss
# Computes log loss, deviance-based D-squared (relative to the reference
# mean `ref`), and AUC for one dataset/model combination.
# Note: Bernoulli deviance = 2 * log loss.
perf <- function(X, actual, predicted, ref) {
  obs <- X[[actual]]
  prob <- X[[predicted]]
  scores <- c(
    `log loss` = logLoss(obs, prob),
    `D-Squared` = r_squared_bernoulli(obs, prob, reference_mean = ref),
    AUC = AUC(obs, prob)
  )
  tibble(Score = scores) %>%
    mutate(Measure = factor(names(Score), names(Score)))
}
# Apply it to combinations of dataset and model
df_perf <- df_cali %>%
group_by(dataset, model) %>%
summarize(
perf(cur_data(), y_var, "prediction", mean(y_train)),
.groups = "drop"
)
# Wide table with 3 significant digits, used in the text of the guide
df_perf_for_text <- df_perf %>%
mutate(Score = num(Score, digits = 3)) %>%
pivot_wider(
"model",
names_from = c("Measure", "dataset"),
values_from = "Score",
names_sort = TRUE
)
df_perf %>%
filter(Measure != "AUC") %>%
ggplot(aes(y = Score, x = dataset, color = model, group = model)) +
geom_point(size = 2) +
geom_line() +
facet_wrap(~ Measure, scales = "free")df_murphy <- df_cali %>%
# NOTE(review): "ols" looks like a leftover from the regression example;
# here only "trivial" is actually excluded — confirm intent.
filter(!(model %in% c("trivial", "ols"))) %>%
group_by(dataset, model) %>%
summarize(
murphy_diagram(
cur_data()[[y_var]],
cur_data()$prediction,
plot = FALSE,
theta = seq(0, 1, by = 0.02)
),
.groups = "drop"
) %>%
rename(Score = predicted)
# Vertical line is the mean y on the training data (for both graphs)
ggplot(df_murphy, aes(y = Score, x = theta)) +
geom_line(aes(color = model, linetype = model, group = model), size = 0.75) +
geom_vline(xintercept = mean(train[[y_var]]), linetype = 2, size = 0.75) +
facet_wrap(~ dataset, scales = "free", ncol = 2) +
theme(legend.position = c(0.25, 0.2), legend.title = element_blank())The html is generated with the follow packages (slightly newer than the ones used in the published tutorial).
sessionInfo()## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] splines stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] reliabilitydiag_0.2.0 splitTools_0.3.1 ranger_0.12.1
## [4] OpenML_1.10 withr_2.4.2 glm2_1.2.1
## [7] scales_1.1.1 patchwork_1.1.1 ggcorrplot_0.1.3
## [10] arrow_4.0.1 xgboost_1.4.1.1 MetricsWeighted_0.5.3
## [13] hexbin_1.28.2 lubridate_1.7.10 forcats_0.5.1
## [16] stringr_1.4.0 dplyr_1.0.6 purrr_0.3.4
## [19] readr_1.4.0 tidyr_1.1.3 tibble_3.1.2
## [22] ggplot2_3.3.3 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-152 fs_1.5.0 bit64_4.0.5 httr_1.4.2
## [5] tools_4.0.4 backports_1.2.1 bslib_0.3.1 utf8_1.2.1
## [9] R6_2.5.0 DBI_1.1.1 mgcv_1.8-36 colorspace_2.0-1
## [13] tidyselect_1.1.1 bit_4.0.4 curl_4.3.1 compiler_4.0.4
## [17] cli_2.5.0 rvest_1.0.0 xml2_1.3.2 labeling_0.4.2
## [21] sass_0.4.0 checkmate_2.0.0 digest_0.6.27 rmarkdown_2.8
## [25] pkgconfig_2.0.3 htmltools_0.5.2 dbplyr_2.1.1 fastmap_1.1.0
## [29] highr_0.9 rlang_0.4.11 readxl_1.3.1 rstudioapi_0.13
## [33] BBmisc_1.11 jquerylib_0.1.4 generics_0.1.0 farver_2.1.0
## [37] jsonlite_1.7.2 magrittr_2.0.1 Matrix_1.3-4 Rcpp_1.0.7
## [41] munsell_0.5.0 fansi_0.5.0 lifecycle_1.0.0 stringi_1.6.2
## [45] yaml_2.2.1 plyr_1.8.6 grid_4.0.4 crayon_1.4.1
## [49] lattice_0.20-44 haven_2.4.1 hms_1.1.0 knitr_1.33
## [53] pillar_1.6.1 reshape2_1.4.4 reprex_2.0.0 XML_3.99-0.6
## [57] glue_1.4.2 evaluate_0.14 data.table_1.14.0 modelr_0.1.8
## [61] vctrs_0.3.8 cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
## [65] cachem_1.0.5 xfun_0.23 broom_0.7.7 farff_1.1.1
## [69] viridisLite_0.4.0 memoise_2.0.0 ellipsis_0.3.2